Importing Libraries¶

In [150]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings('ignore')
Training Dataset¶
In [151]:
df1 = pd.read_csv('C:/Users/purva/Downloads/ML_projects/House Pricing Project/training_set.csv')
Testing Dataset¶
In [152]:
df2 = pd.read_csv('C:/Users/purva/Downloads/ML_projects/House Pricing Project/testing_set.csv')
In [153]:
df1.describe()
Out[153]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
count 1460.000000 1460.000000 1201.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1452.000000 1460.000000 ... 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 730.500000 56.897260 70.049958 10516.828082 6.099315 5.575342 1971.267808 1984.865753 103.685262 443.639726 ... 94.244521 46.660274 21.954110 3.409589 15.060959 2.758904 43.489041 6.321918 2007.815753 180921.195890
std 421.610009 42.300571 24.284752 9981.264932 1.382997 1.112799 30.202904 20.645407 181.066207 456.098091 ... 125.338794 66.256028 61.119149 29.317331 55.757415 40.177307 496.123024 2.703626 1.328095 79442.502883
min 1.000000 20.000000 21.000000 1300.000000 1.000000 1.000000 1872.000000 1950.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2006.000000 34900.000000
25% 365.750000 20.000000 59.000000 7553.500000 5.000000 5.000000 1954.000000 1967.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.000000 2007.000000 129975.000000
50% 730.500000 50.000000 69.000000 9478.500000 6.000000 5.000000 1973.000000 1994.000000 0.000000 383.500000 ... 0.000000 25.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.000000 2008.000000 163000.000000
75% 1095.250000 70.000000 80.000000 11601.500000 7.000000 6.000000 2000.000000 2004.000000 166.000000 712.250000 ... 168.000000 68.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 2009.000000 214000.000000
max 1460.000000 190.000000 313.000000 215245.000000 10.000000 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 ... 857.000000 547.000000 552.000000 508.000000 480.000000 738.000000 15500.000000 12.000000 2010.000000 755000.000000

8 rows × 38 columns

In [154]:
df1.info
Out[154]:
<bound method DataFrame.info of         Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL         65.0     8450   Pave   NaN      Reg   
1        2          20       RL         80.0     9600   Pave   NaN      Reg   
2        3          60       RL         68.0    11250   Pave   NaN      IR1   
3        4          70       RL         60.0     9550   Pave   NaN      IR1   
4        5          60       RL         84.0    14260   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1455  1456          60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457          20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458          70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459          20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature MiscVal  \
0            Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1            Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
2            Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
3            Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
4            Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
...          ...       ...  ...      ...    ...    ...         ...     ...   
1455         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1456         Lvl    AllPub  ...        0    NaN  MnPrv         NaN       0   
1457         Lvl    AllPub  ...        0    NaN  GdPrv        Shed    2500   
1458         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1459         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   

     MoSold YrSold  SaleType  SaleCondition  SalePrice  
0         2   2008        WD         Normal     208500  
1         5   2007        WD         Normal     181500  
2         9   2008        WD         Normal     223500  
3         2   2006        WD        Abnorml     140000  
4        12   2008        WD         Normal     250000  
...     ...    ...       ...            ...        ...  
1455      8   2007        WD         Normal     175000  
1456      2   2010        WD         Normal     210000  
1457      5   2010        WD         Normal     266500  
1458      4   2010        WD         Normal     142125  
1459      6   2008        WD         Normal     147500  

[1460 rows x 81 columns]>
In [155]:
df1.columns
Out[155]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')
In [156]:
df1.shape
Out[156]:
(1460, 81)

Null treatment¶

In [157]:
plt.figure(figsize=(15,6))
df1.isna().sum().plot(kind='bar')
Out[157]:
<Axes: >
In [158]:
for i in df1.columns:
        if df1[i].dtypes==object:
            df1[i]=df1[i].fillna(df1[i].mode()[0])
        else:
            df1[i]=df1[i].fillna(df1[i].mean())
In [159]:
plt.figure(figsize=(15,6))
df1.isna().sum().plot(kind='bar')
Out[159]:
<Axes: >

EDA¶

In [160]:
cat=[]
con=[]
for i in df1.columns:
    if df1[i].dtypes==object:
        cat.append(i)
    else:
        con.append(i)
print('categorical features :',cat)
print('-------------------------------')
print('continues features :',con)
categorical features : ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
-------------------------------
continues features : ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
In [161]:
from matplotlib.pyplot import show 
for i in  df1.columns:
    if df1[i].dtypes==object:
        sns.boxplot(data=df1,x='SalePrice',y=i)
        print('SalePrice vs ',i)
        show()
    else:
        sns.scatterplot(data=df1,x='SalePrice',y=i)
        print('SalePrice vs ',i)
        show()
SalePrice vs  Id
SalePrice vs  MSSubClass
SalePrice vs  MSZoning
SalePrice vs  LotFrontage
SalePrice vs  LotArea
SalePrice vs  Street
SalePrice vs  Alley
SalePrice vs  LotShape
SalePrice vs  LandContour
SalePrice vs  Utilities
SalePrice vs  LotConfig
SalePrice vs  LandSlope
SalePrice vs  Neighborhood
SalePrice vs  Condition1
SalePrice vs  Condition2
SalePrice vs  BldgType
SalePrice vs  HouseStyle
SalePrice vs  OverallQual
SalePrice vs  OverallCond
SalePrice vs  YearBuilt
SalePrice vs  YearRemodAdd
SalePrice vs  RoofStyle
SalePrice vs  RoofMatl
SalePrice vs  Exterior1st
SalePrice vs  Exterior2nd
SalePrice vs  MasVnrType
SalePrice vs  MasVnrArea
SalePrice vs  ExterQual
SalePrice vs  ExterCond
SalePrice vs  Foundation
SalePrice vs  BsmtQual
SalePrice vs  BsmtCond
SalePrice vs  BsmtExposure
SalePrice vs  BsmtFinType1
SalePrice vs  BsmtFinSF1
SalePrice vs  BsmtFinType2
SalePrice vs  BsmtFinSF2
SalePrice vs  BsmtUnfSF
SalePrice vs  TotalBsmtSF
SalePrice vs  Heating
SalePrice vs  HeatingQC
SalePrice vs  CentralAir
SalePrice vs  Electrical
SalePrice vs  1stFlrSF
SalePrice vs  2ndFlrSF
SalePrice vs  LowQualFinSF
SalePrice vs  GrLivArea
SalePrice vs  BsmtFullBath
SalePrice vs  BsmtHalfBath
SalePrice vs  FullBath
SalePrice vs  HalfBath
SalePrice vs  BedroomAbvGr
SalePrice vs  KitchenAbvGr
SalePrice vs  KitchenQual
SalePrice vs  TotRmsAbvGrd
SalePrice vs  Functional
SalePrice vs  Fireplaces
SalePrice vs  FireplaceQu
SalePrice vs  GarageType
SalePrice vs  GarageYrBlt
SalePrice vs  GarageFinish
SalePrice vs  GarageCars
SalePrice vs  GarageArea
SalePrice vs  GarageQual
SalePrice vs  GarageCond
SalePrice vs  PavedDrive
SalePrice vs  WoodDeckSF
SalePrice vs  OpenPorchSF
SalePrice vs  EnclosedPorch
SalePrice vs  3SsnPorch
SalePrice vs  ScreenPorch
SalePrice vs  PoolArea
SalePrice vs  PoolQC
SalePrice vs  Fence
SalePrice vs  MiscFeature
SalePrice vs  MiscVal
SalePrice vs  MoSold
SalePrice vs  YrSold
SalePrice vs  SaleType
SalePrice vs  SaleCondition
SalePrice vs  SalePrice
In [162]:
plt.figure(figsize=(15,6))
corr = df1.corr()
sns.heatmap(corr, annot=True)
plt.show()

Feature selection and Train Test Split¶

Forward selection¶

In [163]:
df1.columns
Out[163]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')
In [164]:
df1.shape
Out[164]:
(1460, 81)
In [165]:
X = df1.drop(labels ='SalePrice',axis=1)
Y = df1[['SalePrice']]
In [166]:
from ML_codes import cat_con
In [167]:
cat_con(X)
categorical features : ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
continues features : ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
In [168]:
xcat = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
xcon = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
In [169]:
Xcat = X[xcat]
Xcon = X[xcon]
In [170]:
from sklearn.preprocessing import StandardScaler , LabelEncoder
In [171]:
le = LabelEncoder()
ss= StandardScaler()
In [172]:
Xcon = pd.DataFrame(ss.fit_transform(Xcon),columns=xcon)
In [173]:
for i in Xcat.columns:
    Xcat[i]=le.fit_transform(Xcat[i])
In [174]:
X = Xcon.join(Xcat)
X
Out[174]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... GarageType GarageFinish GarageQual GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition
0 -1.730865 0.073375 -0.229372 -0.207142 0.651479 -0.517200 1.050994 0.878668 0.511418 0.575425 ... 1 1 4 4 2 2 2 2 8 4
1 -1.728492 -0.872563 0.451936 -0.091886 -0.071836 2.179628 0.156734 -0.429577 -0.574410 1.171992 ... 1 1 4 4 2 2 2 2 8 4
2 -1.726120 0.073375 -0.093110 0.073480 0.651479 -0.517200 0.984752 0.830215 0.323060 0.092907 ... 1 1 4 4 2 2 2 2 8 4
3 -1.723747 0.309859 -0.456474 -0.096897 0.651479 -0.517200 -1.863632 -0.720298 -0.574410 -0.499274 ... 5 2 4 4 2 2 2 2 8 0
4 -1.721374 0.073375 0.633618 0.375148 1.374795 -0.517200 0.951632 0.733308 1.364570 0.463568 ... 1 1 4 4 2 2 2 2 8 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 1.721374 0.073375 -0.365633 -0.260560 -0.071836 -0.517200 0.918511 0.733308 -0.574410 -0.973018 ... 1 1 4 4 2 2 2 2 8 4
1456 1.723747 -0.872563 0.679039 0.266407 -0.071836 0.381743 0.222975 0.151865 0.084843 0.759659 ... 1 2 4 4 2 2 2 2 8 4
1457 1.726120 0.309859 -0.183951 -0.147810 0.651479 3.078570 -1.002492 1.024029 -0.574410 -0.369871 ... 1 1 4 4 2 2 0 2 8 4
1458 1.728492 -0.872563 -0.093110 -0.080160 -0.795151 0.381743 -0.704406 0.539493 -0.574410 -0.865548 ... 1 2 4 4 2 2 2 2 8 4
1459 1.730865 -0.872563 0.224833 -0.058112 -0.795151 0.381743 -0.207594 -0.962566 -0.574410 0.847389 ... 1 0 4 4 2 2 2 2 8 4

1460 rows × 80 columns

In [175]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
lr = LinearRegression()
for i in Xcat.columns:
    xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2)
    model = lr.fit(xtrain,ytrain)
    tr_pred = model.predict(xtrain)
    ts_pred = model.predict(xtest)
    tr_score = r2_score(tr_pred,ytrain)
    ts_score = r2_score(ts_pred,ytest)
    print('*******************************************************************************')
    print(i)
    print(tr_score)
    print(ts_score)
    print('*******************************************************************************')
*******************************************************************************
MSZoning
0.8245928164811406
-0.0034364018180486067
*******************************************************************************
*******************************************************************************
Street
0.8323826361168949
0.7480507038008104
*******************************************************************************
*******************************************************************************
Alley
0.8331163845143464
-0.0034364256743986132
*******************************************************************************
*******************************************************************************
LotShape
0.8287303174117688
0.7232099685486453
*******************************************************************************
*******************************************************************************
LandContour
0.8388665687788034
0.7547297018945995
*******************************************************************************
*******************************************************************************
Utilities
0.8330065947128444
-0.0034364261253896355
*******************************************************************************
*******************************************************************************
LotConfig
0.8309995238601476
0.7911845587987087
*******************************************************************************
*******************************************************************************
LandSlope
0.8326685848280592
0.7927872597414736
*******************************************************************************
*******************************************************************************
Neighborhood
0.8567026931925429
0.6763744936626719
*******************************************************************************
*******************************************************************************
Condition1
0.8471466244653384
0.7398085953763389
*******************************************************************************
*******************************************************************************
Condition2
0.8239208952549005
0.856764675437776
*******************************************************************************
*******************************************************************************
BldgType
0.8513357937798982
0.701900019505364
*******************************************************************************
*******************************************************************************
HouseStyle
0.825752321112229
0.7208575651255756
*******************************************************************************
*******************************************************************************
RoofStyle
0.8613188006753336
0.7041882187743602
*******************************************************************************
*******************************************************************************
RoofMatl
0.8398478650527693
0.7920606437092762
*******************************************************************************
*******************************************************************************
Exterior1st
0.8228921585322415
-0.003436425811298438
*******************************************************************************
*******************************************************************************
Exterior2nd
0.8258299029188175
-0.0034364242254396604
*******************************************************************************
*******************************************************************************
MasVnrType
0.8258023173644041
-0.0034364259947012865
*******************************************************************************
*******************************************************************************
ExterQual
0.8216456911996279
0.7989069296463808
*******************************************************************************
*******************************************************************************
ExterCond
0.8320259102879335
0.7867431215452461
*******************************************************************************
*******************************************************************************
Foundation
0.8081730410516667
0.8738180491262793
*******************************************************************************
*******************************************************************************
BsmtQual
0.8708053823718767
0.6159064925774724
*******************************************************************************
*******************************************************************************
BsmtCond
0.7260000164358311
0.6337607070898481
*******************************************************************************
*******************************************************************************
BsmtExposure
0.8315600084616039
0.8229570211506989
*******************************************************************************
*******************************************************************************
BsmtFinType1
0.8555399599582174
-0.0034364258467534103
*******************************************************************************
*******************************************************************************
BsmtFinType2
0.8526148436627388
0.6908442588139856
*******************************************************************************
*******************************************************************************
Heating
0.827726618472953
-0.0034364260857298046
*******************************************************************************
*******************************************************************************
HeatingQC
0.8384321028654471
0.7817050241808596
*******************************************************************************
*******************************************************************************
CentralAir
0.8623401173554499
-0.003436426026917072
*******************************************************************************
*******************************************************************************
Electrical
0.7560606535151301
-0.003436426129582504
*******************************************************************************
*******************************************************************************
KitchenQual
0.8041483771591312
0.8330381314464508
*******************************************************************************
*******************************************************************************
Functional
0.8251760056263534
0.8651374435085786
*******************************************************************************
*******************************************************************************
FireplaceQu
0.865181299860468
0.44019070916043224
*******************************************************************************
*******************************************************************************
GarageType
0.8287570707875284
0.7973805809506005
*******************************************************************************
*******************************************************************************
GarageFinish
0.8296862769290205
0.7868122627360405
*******************************************************************************
*******************************************************************************
GarageQual
0.8618434651635145
0.6779404508929997
*******************************************************************************
*******************************************************************************
GarageCond
0.8292790674935437
-0.0034364261283077457
*******************************************************************************
*******************************************************************************
PavedDrive
0.8316027413126958
0.8363955577723898
*******************************************************************************
*******************************************************************************
PoolQC
0.8672234867360442
0.6725287314006727
*******************************************************************************
*******************************************************************************
Fence
0.8232859369540062
0.8142217391965945
*******************************************************************************
*******************************************************************************
MiscFeature
0.8242903876272583
0.8714713567018831
*******************************************************************************
*******************************************************************************
SaleType
0.8921378732760515
0.6065871712085489
*******************************************************************************
*******************************************************************************
SaleCondition
0.8631777827889595
-0.003436425851724101
*******************************************************************************

Backward Elimination¶

In [176]:
from statsmodels.api import add_constant,OLS
ols=OLS(ytrain,add_constant(xtrain))
model=ols.fit()
model.summary()
Out[176]:
OLS Regression Results
Dep. Variable: SalePrice R-squared: 0.883
Model: OLS Adj. R-squared: 0.875
Method: Least Squares F-statistic: 106.8
Date: Sat, 20 Jul 2024 Prob (F-statistic): 0.00
Time: 17:40:45 Log-Likelihood: -13618.
No. Observations: 1168 AIC: 2.739e+04
Df Residuals: 1090 BIC: 2.779e+04
Df Model: 77
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 3.503e+05 5.5e+04 6.371 0.000 2.42e+05 4.58e+05
Id 535.3849 877.600 0.610 0.542 -1186.592 2257.362
MSSubClass -4271.2452 1945.208 -2.196 0.028 -8088.020 -454.470
LotFrontage 1082.3021 1154.094 0.938 0.349 -1182.195 3346.799
LotArea 4443.0872 1008.894 4.404 0.000 2463.494 6422.681
OverallQual 1.158e+04 1680.831 6.889 0.000 8281.084 1.49e+04
OverallCond 6938.1428 1210.576 5.731 0.000 4562.820 9313.466
YearBuilt 7119.7633 2581.639 2.758 0.006 2054.218 1.22e+04
YearRemodAdd -74.8490 1439.306 -0.052 0.959 -2898.972 2749.274
MasVnrArea 6211.4388 1127.192 5.511 0.000 3999.728 8423.150
BsmtFinSF1 8623.0758 1118.914 7.707 0.000 6427.607 1.08e+04
BsmtFinSF2 325.7570 1303.708 0.250 0.803 -2232.304 2883.818
BsmtUnfSF -1127.9579 903.495 -1.248 0.212 -2900.745 644.829
TotalBsmtSF 7948.6429 1314.093 6.049 0.000 5370.206 1.05e+04
1stFlrSF 8813.5941 1554.109 5.671 0.000 5764.211 1.19e+04
2ndFlrSF 1.056e+04 1332.369 7.928 0.000 7948.870 1.32e+04
LowQualFinSF -1283.6941 950.247 -1.351 0.177 -3148.213 580.825
GrLivArea 1.514e+04 1365.572 11.087 0.000 1.25e+04 1.78e+04
BsmtFullBath 465.3895 1335.517 0.348 0.728 -2155.086 3085.865
BsmtHalfBath -547.2907 959.495 -0.570 0.569 -2429.957 1335.376
FullBath 1912.1815 1533.539 1.247 0.213 -1096.840 4921.203
HalfBath 936.5892 1315.974 0.712 0.477 -1645.540 3518.718
BedroomAbvGr -3913.1913 1440.390 -2.717 0.007 -6739.442 -1086.940
KitchenAbvGr -4675.1851 1174.369 -3.981 0.000 -6979.464 -2370.906
TotRmsAbvGrd 4725.4328 1999.426 2.363 0.018 802.273 8648.593
Fireplaces 3732.1940 1166.372 3.200 0.001 1443.606 6020.782
GarageYrBlt 1284.7988 1650.073 0.779 0.436 -1952.879 4522.477
GarageCars 3016.0192 2106.055 1.432 0.152 -1116.362 7148.401
GarageArea 2280.3784 2055.193 1.110 0.267 -1752.203 6312.960
WoodDeckSF 2215.5906 954.388 2.321 0.020 342.945 4088.237
OpenPorchSF -611.1213 990.312 -0.617 0.537 -2554.254 1332.012
EnclosedPorch -417.5214 980.060 -0.426 0.670 -2340.540 1505.497
3SsnPorch 845.1519 955.280 0.885 0.377 -1029.244 2719.548
ScreenPorch 2100.9254 894.602 2.348 0.019 345.589 3856.262
PoolArea 1232.3222 1219.260 1.011 0.312 -1160.041 3624.685
MiscVal -2833.7893 1438.043 -1.971 0.049 -5655.434 -12.144
MoSold -922.2999 882.367 -1.045 0.296 -2653.629 809.029
YrSold -408.6439 886.091 -0.461 0.645 -2147.281 1329.993
MSZoning -926.7597 1650.040 -0.562 0.574 -4164.375 2310.855
Street 2.14e+04 1.4e+04 1.529 0.127 -6062.804 4.89e+04
Alley -4495.0258 6053.999 -0.742 0.458 -1.64e+04 7383.784
LotShape -669.1901 683.982 -0.978 0.328 -2011.260 672.880
LandContour 2752.8194 1389.489 1.981 0.048 26.443 5479.196
Utilities -4.012e-11 1.69e-11 -2.377 0.018 -7.32e-11 -7e-12
LotConfig -205.9756 559.871 -0.368 0.713 -1304.522 892.571
LandSlope 3216.0627 3931.571 0.818 0.414 -4498.241 1.09e+04
Neighborhood 136.4865 159.160 0.858 0.391 -175.809 448.782
Condition1 -1833.1329 1012.618 -1.810 0.071 -3820.034 153.768
Condition2 -1.734e+04 3660.277 -4.737 0.000 -2.45e+04 -1.02e+04
BldgType -367.8936 1534.811 -0.240 0.811 -3379.412 2643.625
HouseStyle -604.2676 674.021 -0.897 0.370 -1926.794 718.259
RoofStyle 2248.4983 1171.016 1.920 0.055 -49.203 4546.199
RoofMatl 2895.9521 1451.630 1.995 0.046 47.647 5744.257
Exterior1st -1203.5881 531.296 -2.265 0.024 -2246.067 -161.109
Exterior2nd 742.0937 485.490 1.529 0.127 -210.506 1694.694
MasVnrType 6510.7830 1624.206 4.009 0.000 3323.859 9697.707
ExterQual -8946.7256 1983.295 -4.511 0.000 -1.28e+04 -5055.218
ExterCond 1264.1551 1318.998 0.958 0.338 -1323.907 3852.218
Foundation 1690.4702 1792.403 0.943 0.346 -1826.480 5207.420
BsmtQual -7301.9380 1475.000 -4.950 0.000 -1.02e+04 -4407.778
BsmtCond 2218.7610 1373.472 1.615 0.107 -476.186 4913.708
BsmtExposure -2704.5450 922.616 -2.931 0.003 -4514.848 -894.242
BsmtFinType1 841.3309 665.720 1.264 0.207 -464.907 2147.569
BsmtFinType2 -460.8678 1458.224 -0.316 0.752 -3322.112 2400.376
Heating -1772.7638 3079.643 -0.576 0.565 -7815.463 4269.935
HeatingQC -450.6785 635.021 -0.710 0.478 -1696.680 795.323
CentralAir -4637.2940 4672.236 -0.993 0.321 -1.38e+04 4530.300
Electrical -635.1387 933.625 -0.680 0.496 -2467.045 1196.767
KitchenQual -8770.0820 1476.135 -5.941 0.000 -1.17e+04 -5873.694
Functional 4341.3901 966.594 4.491 0.000 2444.794 6237.986
FireplaceQu -1249.1164 1151.685 -1.085 0.278 -3508.886 1010.653
GarageType 698.9171 648.012 1.079 0.281 -572.575 1970.410
GarageFinish 565.7042 1506.910 0.375 0.707 -2391.068 3522.476
GarageQual -1079.9054 1795.605 -0.601 0.548 -4603.138 2443.327
GarageCond 2516.6083 2064.993 1.219 0.223 -1535.202 6568.418
PavedDrive 195.0864 2088.697 0.093 0.926 -3903.236 4293.408
PoolQC -3.596e+04 1.45e+04 -2.476 0.013 -6.45e+04 -7462.718
Fence 2049.6855 2087.469 0.982 0.326 -2046.226 6145.597
MiscFeature -3.973e+04 1.79e+04 -2.217 0.027 -7.49e+04 -4561.544
SaleType -85.9780 599.855 -0.143 0.886 -1262.979 1091.023
SaleCondition 2954.5572 860.978 3.432 0.001 1265.195 4643.920
Omnibus: 520.300 Durbin-Watson: 2.009
Prob(Omnibus): 0.000 Jarque-Bera (JB): 55517.418
Skew: -1.068 Prob(JB): 0.00
Kurtosis: 36.708 Cond. No. 1.01e+16


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 7.68e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [177]:
model.pvalues.sort_values(ascending=False)
Out[177]:
YearRemodAdd    9.585354e-01
PavedDrive      9.256022e-01
SaleType        8.860549e-01
BldgType        8.106082e-01
BsmtFinSF2      8.027353e-01
                    ...     
const           2.773057e-10
OverallQual     9.485603e-12
BsmtFinSF1      2.895225e-14
2ndFlrSF        5.475919e-15
GrLivArea       3.843453e-27
Length: 81, dtype: float64
In [178]:
c=model.pvalues.sort_values().index[-1]
X=X.drop(labels=c,axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=21)
ols=OLS(ytrain,add_constant(xtrain))
model=ols.fit()
score=round(model.rsquared_adj,3)
c=model.pvalues.sort_values().index[-1]
print(c)
print(score)
Fence
0.859
In [179]:
c=model.pvalues.sort_values().index[-1]
X=X.drop(labels=c,axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=21)
ols=OLS(ytrain,add_constant(xtrain))
model=ols.fit()
score=round(model.rsquared_adj,3)
c=model.pvalues.sort_values().index[-1]
print(c)
print(score)
Foundation
0.859
In [180]:
c=model.pvalues.sort_values().index[-1]
X=X.drop(labels=c,axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=21)
ols=OLS(ytrain,add_constant(xtrain))
model=ols.fit()
score=round(model.rsquared_adj,3)
c=model.pvalues.sort_values().index[-1]
print(c)
print(score)
FireplaceQu
0.859
In [181]:
c=model.pvalues.sort_values().index[-1]
X=X.drop(labels=c,axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=21)
ols=OLS(ytrain,add_constant(xtrain))
model=ols.fit()
score=round(model.rsquared_adj,3)
c=model.pvalues.sort_values().index[-1]
print(c)
print(score)
GarageType
0.859
In [182]:
c=model.pvalues.sort_values().index[-1]
X=X.drop(labels=c,axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=21)
ols=OLS(ytrain,add_constant(xtrain))
model=ols.fit()
score=round(model.rsquared_adj,3)
c=model.pvalues.sort_values().index[-1]
print(c)
print(score)
Condition2
0.859
In [183]:
c=model.pvalues.sort_values().index[-1]
X=X.drop(labels=c,axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=21)
ols=OLS(ytrain,add_constant(xtrain))
model=ols.fit()
score=round(model.rsquared_adj,3)
c=model.pvalues.sort_values().index[-1]
print(c)
print(score)
CentralAir
0.859
In [184]:
c=model.pvalues.sort_values().index[-1]
X=X.drop(labels=c,axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=21)
ols=OLS(ytrain,add_constant(xtrain))
model=ols.fit()
score=round(model.rsquared_adj,3)
c=model.pvalues.sort_values().index[-1]
print(c)
print(score)
GarageYrBlt
0.859
In [185]:
X.shape
Out[185]:
(1460, 73)

Model building >> Ridge¶

In [186]:
from sklearn.linear_model import Ridge
rr=Ridge(alpha=25) 
model=rr.fit(xtrain,ytrain)
tr_pred=model.predict(xtrain)
ts_pred=model.predict(xtest)
tr_score=r2_score(ytrain,tr_pred)
ts_score=r2_score(ytest,ts_pred)
In [187]:
print(tr_score)
print(ts_score)
0.8601660549303662
0.8035245219621637

Model building >> Lasso¶

In [188]:
from sklearn.linear_model import Lasso
l1=Lasso(alpha=0.5)
model=l1.fit(xtrain,ytrain)
tr_pred=model.predict(xtrain)
ts_pred=model.predict(xtest)
tr_score=r2_score(ytrain,tr_pred)
ts_score=r2_score(ytest,ts_pred)
print(tr_score)
print(ts_score)
0.867998361160488
0.799011737601639
In [189]:
w=[]
e=0.01
for i in range(0,1000,1):
    w.append(e)
    e=round(e+0.01,4)
In [190]:
from sklearn.metrics import  mean_squared_error
rr = Ridge(alpha=10)
model = rr.fit(xtrain,ytrain)
tr_pred = model.predict(xtrain)
ts_pred = model.predict(xtest)

tr_err=mean_squared_error(ytrain,tr_pred)
ts_err=mean_squared_error(ytest,ts_pred)

tr_score = r2_score(ytrain,tr_pred)
ts_score = r2_score(ytest,ts_pred)

print('traning score:',tr_score)
print('testing score:',ts_score)

print(tr_err)
print(ts_err)
traning score: 0.8628308466052411
testing score: 0.8019561153896838
848929674.1909673
1342205192.3466187

Evaluate model with MSE, RMSE, MAE, R2 , R2_adj¶

In [191]:
# MSE
tr_err=mean_squared_error(ytrain,tr_pred)
ts_err=mean_squared_error(ytest,ts_pred)
print(tr_err)
print(ts_err)
848929674.1909673
1342205192.3466187
In [192]:
# RMSE
import numpy as np 
training_rmse = np.sqrt(tr_err)
testing_rmse = np.sqrt(ts_err)
print(training_rmse)
print(testing_rmse)
29136.39775591635
36636.118685617046
In [193]:
# MAE
from sklearn.metrics import mean_absolute_error
tr_err=mean_absolute_error(ytrain,tr_pred)
ts_err=mean_absolute_error(ytest,ts_pred)
print(tr_err)
print(ts_err)
18304.01771012517
20162.481947226697
In [194]:
# R2 score
tr_score = r2_score(ytrain,tr_pred)
ts_score = r2_score(ytest,ts_pred)

print('traning score:',tr_score)
print('testing score:',ts_score)
traning score: 0.8628308466052411
testing score: 0.8019561153896838
In [196]:
# R2_adj 
n = len(ytrain)
p = len(xtrain.columns)
r2_adj = 1 - (1 - tr_score) * (n - 1) / (n - p - 1)
print(r2_adj)
0.8536778775030314
In [197]:
n = len(ytest)
p = len(xtest.columns)
r2_adj = 1 - (1 - ts_score) * (n - 1) / (n - p - 1)
print(r2_adj)
0.7356386677908164

Prediction of testing set¶

In [198]:
df2.head()
Out[198]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 RH 80.0 11622 Pave NaN Reg Lvl AllPub ... 120 0 NaN MnPrv NaN 0 6 2010 WD Normal
1 1462 20 RL 81.0 14267 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN Gar2 12500 6 2010 WD Normal
2 1463 60 RL 74.0 13830 Pave NaN IR1 Lvl AllPub ... 0 0 NaN MnPrv NaN 0 3 2010 WD Normal
3 1464 60 RL 78.0 9978 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 6 2010 WD Normal
4 1465 120 RL 43.0 5005 Pave NaN IR1 HLS AllPub ... 144 0 NaN NaN NaN 0 1 2010 WD Normal

5 rows × 80 columns

In [199]:
from ML_codes import null_treat
null_treat(df2)
In [200]:
cat=[]
con=[]
for i in df2.columns:
    if df2[i].dtypes==object:
        cat.append(i)
    else:
        con.append(i)
print('categorical features :',cat)
print('-------------------------------')
print('continues features :',con)
categorical features : ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
-------------------------------
continues features : ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
In [201]:
cat = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
con = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
In [202]:
Xtcat = df2[cat]
Xtcon = df2[con]
In [203]:
Xtcon = pd.DataFrame(ss.fit_transform(Xtcon) , columns=con)
In [204]:
for i in Xtcat.columns:
    Xtcat[i]=le.fit_transform(Xtcat[i])
In [205]:
Xt = Xtcon.join(Xtcat)
xt = Xt.drop(labels=['Condition2', 'Fence', 'FireplaceQu', 'Foundation', 'GarageType', 'YearRemodAdd', 'CentralAir'], axis=1)
In [206]:
trained_features = xtrain.columns.tolist()
Xt = Xt[trained_features]

# Now you can make predictions
prob = model.predict(Xt)
print(prob[:5])
[[182978.1657155 ]
 [226412.13985591]
 [252956.70407142]
 [260503.68327767]
 [259828.84682967]]
In [207]:
preds = []
for i in prob:
    if i >= 0.5:
        preds.append(1)
    else:
        preds.append(0)
In [208]:
preds[:5]
Out[208]:
[1, 1, 1, 1, 1]

Building Sample Dataset of Output¶

In [209]:
df_result=df2[['Id']]
In [210]:
df_result['SalesPrice']=prob
In [211]:
df_result['probability']=preds
In [212]:
df_result.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           1459 non-null   int64  
 1   SalesPrice   1459 non-null   float64
 2   probability  1459 non-null   int64  
dtypes: float64(1), int64(2)
memory usage: 34.3 KB
In [213]:
df_result.head()
Out[213]:
Id SalesPrice probability
0 1461 182978.165716 1
1 1462 226412.139856 1
2 1463 252956.704071 1
3 1464 260503.683278 1
4 1465 259828.846830 1
In [ ]:
 
In [ ]: